Created by Holger Buech, Q1/2019
Description
Basic statistics & visualizations for the H-MOG Dataset, especially considering aggregation on sessions and subjects.
Purpose
Data Sources
# Standard
from pathlib import Path
import os
import sys
import datetime
# Extra
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.cm as cm
import matplotlib.style as style
import seaborn as sns
# `DatasetLoader` is a custom helper class to retrieve data from hdf5 file.
# It is imported from the `src` package, so the parent of the working directory
# is put on the import path first (only once, if it is not already there).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
from src.utility.dataset_loader_hdf5 import DatasetLoader
# IPython magics: render matplotlib figures inline and execute the sibling
# notebook `utils.ipynb` in this namespace.
# NOTE(review): the `utils_*` helpers and `MAGENTA` used further down are not
# defined in this file — presumably they are provided by `utils.ipynb`; confirm.
%matplotlib inline
%run utils.ipynb
utils_set_output_style()
# Various Settings
TABLE_NAME = "sensors_100hz"  # Table with raw sensor data
HMOG_HDF5 = Path.cwd().parent.joinpath("data", "processed", "hmog_dataset.hdf5")
SEED = 712
# Raw sensor columns, named "<sensor>_<axis>": acc_x, acc_y, ..., mag_z.
FEATURE_COLS = [
    f"{sensor}_{axis}"
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
]
# Figures for thesis are written to <parent of working directory>/reports/figures.
REPORT_PATH = Path.cwd().parent.joinpath("reports", "figures")
# Create the folder (and missing parents) up front; no error if it exists.
REPORT_PATH.mkdir(parents=True, exist_ok=True)
# Load the table TABLE_NAME from the HDF5 file via the project's DatasetLoader.
# NOTE(review): `None` / empty lists are presumably "no limit / no filtering";
# confirm against DatasetLoader, which is not visible in this file.
hmog = DatasetLoader(
hdf5_file=HMOG_HDF5,
table_name=TABLE_NAME,
max_subjects=None,
task_types=[],
exclude_subjects=[],
exclude_cols=[],
seed=SEED,
)
hmog.data_summary()
# Quick look at the full table `hmog.all` (notebook cell outputs): column info,
# first rows, descriptive statistics and count of missing values per column.
hmog.all.info()
hmog.all.head()
hmog.all.describe(include="all", percentiles=[])
hmog.all.isna().sum()
# Cast subjects to categorical for plotting & memory saving
hmog.all["subject"] = hmog.all["subject"].astype("category")
Interpretation:
# Pick one session at random (reproducible through SEED) for a closer look.
np.random.seed(SEED)
session = np.random.choice(hmog.all["session"].unique())
session

# Copy that session's rows and index them by a timestamp built from `sys_time`,
# which is parsed as milliseconds.
is_selected = hmog.all["session"] == session
df_single_session = hmog.all.loc[is_selected].copy()
df_single_session["Session Time"] = pd.to_datetime(
    df_single_session["sys_time"], unit="ms"
)
df_single_session = df_single_session.set_index("Session Time")
df_single_session.head()

# First estimate of the duration: span between first and last timestamp.
print("Session duration according to timestamps:")
df_single_session.index.max() - df_single_session.index.min()

# Second estimate: number of rows divided by 100 rows per second
# (presumably the sampling rate implied by the table name — confirm).
print("Session duration value count & frequency:")
sec = datetime.timedelta(seconds=len(df_single_session) / 100)
str(sec)

task_id = int(df_single_session["task_type"].unique()[0])
print(f"Task Type: {task_id}")
print("where\n[1, 3, 5] are sitting \n[2, 4, 6] are walking")
# Line plot of the selected session: one subplot per sensor, each showing the
# sensor's three axis columns.
plot_rows = [
    [f"{sensor}_{axis}" for axis in ("x", "y", "z")]
    for sensor in ("acc", "gyr", "mag")
]
f, axes = plt.subplots(3, 1, dpi=180, figsize=(5.473, 2))
for ax, row in zip(axes, plot_rows):
    g = df_single_session[row].plot(linewidth=0.3, ax=ax, sharex="col")
    # Anchor the legend just outside the right edge of the axes.
    g.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.0)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-sample.png")
# Histograms of all raw sensor columns over the whole dataset, arranged in a
# 3x3 grid: one grid column (and one colour) per sensor, one grid row per axis.
df_temp = hmog.all[FEATURE_COLS]
# Render the column names as math text for the subplot titles, e.g. "$Acc_x$".
df_temp.columns = ["$" + c.capitalize() + "$" for c in df_temp.columns]

f, axes = plt.subplots(3, 3, sharex="col", sharey="col", dpi=180, figsize=(5.473, 2))
f.subplots_adjust(hspace=0.6, wspace=0.2)
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `pyplot.get_cmap` looks up the same colormap and exists in old and new
# releases alike. `cmap` is reused by later cells (colours and `.colors`).
cmap = plt.get_cmap("tab10")
for i, col in enumerate(df_temp.columns):
    # Columns are ordered sensor by sensor, so i // 3 selects the sensor
    # (grid column) and i % 3 the axis (grid row).
    plot_column, plot_row = divmod(i, 3)
    g = sns.distplot(
        df_temp[col],
        kde=False,
        ax=axes[plot_row][plot_column],
        color=cmap(plot_column),
        hist_kws=dict(alpha=1),
    )
    g.set_title(f"{col}")
    g.set_yscale("log")  # logarithmic count axis
    g.axes.set_xlabel("")
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-sensor-distribution.png")
Interpretation:
Sensor Data Distribution
# One tall figure per sensor column: horizontal boxplots, one box per subject.
for fig_num, channel in enumerate(FEATURE_COLS):
    plt.figure(fig_num, figsize=(20, 30))
    ax = sns.boxplot(
        data=hmog.all,
        x=channel,
        y="subject",
        orient="h",
        fliersize=2,
        color="tab:blue",
        saturation=1,
    )
    ax.set_title(f'Distribution of "{channel}" per Subject')
    ax.axes.set_xlabel("")
    # Grid lines along both axes.
    for axis in (ax.axes.xaxis, ax.axes.yaxis):
        axis.grid(True)
# Partial plot for use in thesis
# Boxplot of mag_z for a hand-picked set of ten subjects.
subjects = [
    "256487",
    "257279",
    "261313",
    "264325",
    "277905",
    "278135",
    "326223",
    "336172",
    "342329",
    "352716",
]
in_subset = hmog.all["subject"].isin(subjects)
df_temp = hmog.all.loc[in_subset].copy(deep=True)
# Drop the categories of all unselected subjects from the new "Subject" column.
df_temp["Subject"] = df_temp["subject"].cat.remove_unused_categories()
df_temp = df_temp.rename(columns={"mag_z": "$Mag_z$"})

plt.figure(dpi=180, figsize=(5.473, 2))
g = sns.boxplot(
    data=df_temp,
    x="$Mag_z$",
    y="Subject",
    orient="h",
    **utils_boxplot_style
)
for axis in (g.axes.xaxis, g.axes.yaxis):
    axis.grid(True)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-mag-outlier.png")
Sessions per Subject
# Add session number as column
# The number is the part of the session name after its last underscore.
session_suffix = hmog.all["session"].str.rsplit("_", n=1).str[-1]
hmog.all["session_number"] = session_suffix.astype("int")
hmog.all.head()

# Per subject: number of distinct sessions and number of rows (samples).
session_stats = hmog.all.groupby("subject").agg({"session": ["nunique", "count"]})
df_subjects = session_stats.reset_index()
df_subjects.columns = ["subject", "sessions", "samples"]
# Average session length in minutes: rows per session / 100 / 60.
samples_per_session = df_subjects["samples"] / df_subjects["sessions"]
df_subjects["mean_min_per_session"] = samples_per_session / 100 / 60
# Largest subjects first; later plots rely on this ordering.
df_subjects = df_subjects.sort_values("samples", ascending=False).reset_index(drop=True)
df_subjects.head(3)
# Bar chart of the number of sessions per subject, in df_subjects row order.
plt.figure(dpi=180, figsize=(8, 2))
# These three subjects are drawn in red, all others in blue.
highlighted = ("733162", "796581", "526319")
clrs = [
    "tab:red" if x in highlighted else "tab:blue"
    for x in df_subjects["subject"]
]
sns.barplot(
    data=df_subjects,
    x="subject",
    y="sessions",
    order=df_subjects["subject"],
    palette=clrs,
    saturation=1,
)
plt.xticks(rotation=90, fontsize=4);
Samples per Subject
# Bar chart of the number of samples per subject, in df_subjects row order.
plt.figure(dpi=180, figsize=(8, 2))
# These three subjects are drawn in red, all others in blue.
highlighted = ("733162", "796581", "526319")
clrs = [
    "tab:red" if x in highlighted else "tab:blue"
    for x in df_subjects["subject"]
]
sns.barplot(
    data=df_subjects,
    x="subject",
    y="samples",
    order=df_subjects["subject"],
    palette=clrs,
    saturation=1,
)
plt.xticks(rotation=90, fontsize=4);
# Partial plot for use in thesis
# Keep only the first 14 and the last 18 rows of df_subjects (it is sorted by
# sample count, descending); pd.concat returns a new frame, df_subjects itself
# stays untouched.
df_temp = pd.concat([df_subjects.head(14), df_subjects.tail(18)])
df_temp["subject"] = df_temp["subject"].astype(str)
# Subject 771782 is turned into an empty bar labelled "..." — presumably to
# mark the gap between the two groups; confirm.
is_placeholder = df_temp["subject"] == "771782"
df_temp.loc[is_placeholder, "samples"] = 0
df_temp.loc[is_placeholder, "subject"] = "..."

plt.figure(dpi=180, figsize=(5.473, 2))
# These three subjects are drawn in red, all others in blue.
highlighted = ("733162", "796581", "526319")
clrs = [
    "tab:red" if x in highlighted else "tab:blue"
    for x in df_temp["subject"]
]
g = sns.barplot(
    data=df_temp,
    x="subject",
    y="samples",
    order=df_temp["subject"],
    palette=clrs,
    saturation=1,
)
g.set_ylabel("Samples")
g.set_xlabel("Subjects")
plt.xticks(rotation=90)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-samples-dist.pdf")
Task Types per Subject
# Add session task type as dummies
df_task_dummies = pd.get_dummies(hmog.index["task_type"])
dummy_columns = [f"taskid_{col}" for col in df_task_dummies.columns]
df_task_dummies.columns = dummy_columns
df_task_types = pd.concat([hmog.index, df_task_dummies], axis=1)

# Sum the dummies per subject, i.e. count the rows of `hmog.index` per subject
# and task type (presumably one row per session — confirm).
df_tasks = df_task_types.groupby("subject")[dummy_columns].sum()

# Mapping according to hmog-docu:
# NOTE(review): the positional renaming assumes exactly six task-type columns
# in ascending id order.
df_tasks.columns = [
    f"{activity} + {posture}"
    for activity in ("read", "write", "map")
    for posture in ("sit", "walk")
]
df_tasks.head()
# Stacked bar chart: per subject, the counts from df_tasks stacked by task type.
fig = plt.figure(dpi=180, figsize=(7, 1.4))
g = df_tasks.plot.bar(stacked=True, width=0.6, linewidth=0, ax=plt.gca())
# Single-row legend centred below the axes.
g.legend(loc="upper center", bbox_to_anchor=(0.5, -0.38), ncol=6, fontsize=5)
g.tick_params(axis="both", which="major", pad=0)
g.set_xlabel("Subjects", fontsize=6)
g.set_ylabel("Sessions", fontsize=6)
plt.yticks(np.arange(0, 28, 4.0), fontsize=6)
plt.xticks(fontsize=4)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-tasks-dist.pdf")

# Subjects with a count below 4 for at least one task type; counts of 4 or
# more are masked as NaN.
df_tasks.where(df_tasks < 4).dropna(axis=0, how="all")
Interpretation: The three subjects listed above have fewer than four sessions for at least one task type and should be excluded.
# Aggregate the raw samples per (subject, session number): mean and standard
# deviation of every sensor column, plus median and row count of task_type.
aggs = ["mean", "std"]
agg_spec = {
    f"{sensor}_{axis}": aggs
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
}
agg_spec["task_type"] = ["median", "count"]
# NOTE(review): "subject" is categorical here; depending on the pandas version
# the grouping may also emit empty (subject, session_number) combinations —
# verify the row count of df_session.
df_session = hmog.all.groupby(["subject", "session_number"]).agg(agg_spec)

# Flatten the two-level column index to names like "acc_x_mean".
df_session.columns = ["_".join(col) for col in df_session.columns]
df_session = df_session.rename(columns={"task_type_count": "value_count"}).reset_index()
# Rows per session -> minutes, via / 100 / 60.
df_session["Session Duration in Minutes"] = df_session["value_count"] / 100 / 60
df_session["Samples per Session"] = df_session["value_count"]
df_session.head()
# Histogram of the session durations with a dashed marker at the mean.
print(f"Histogramm of session duration, for all {len(df_session)} sessions")
durations = df_session["Session Duration in Minutes"]
mean_duration = durations.mean()

fig = plt.figure(dpi=180, figsize=(5.473, 2))
g = sns.distplot(durations, kde=False, hist_kws={"alpha": 1, "lw": 0.5})
g.set_xlabel("Session Duration in Minutes", fontsize=6)

# Vertical dashed line at the mean, with a two-line label to its right.
plt.plot(
    [mean_duration] * 2, [0, 220], linestyle="dashed", color=MAGENTA, lw=0.8
)
label_style = {"color": MAGENTA, "horizontalalignment": "left"}
plt.text(mean_duration + 0.5, 180, "mean", fontsize=6, **label_style)
plt.text(
    mean_duration + 0.5, 165, f"({mean_duration:.1f} min)", fontsize=5, **label_style
)

plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-duration.pdf")
# Histogram of the samples per session with a dashed marker at 24000 samples.
print(f"Histogramm of session samples, for all {len(df_session)} sessions")
fig = plt.figure(dpi=180, figsize=(5.473, 2))
g = sns.distplot(
    df_session["Samples per Session"], kde=False, hist_kws={"alpha": 1}
)
g.set_xlabel("Samples per Session", fontsize=6)

# Vertical dashed line at 24000, with a two-line label to its left.
plt.plot([24000] * 2, [0, 220], linestyle="dashed", color=MAGENTA, lw=0.8)
label_style = {"color": MAGENTA, "horizontalalignment": "right"}
plt.text(21000, 170, "24000", fontsize=6, **label_style)
plt.text(21000, 155, "(4 min)", fontsize=5, **label_style)

plt.xticks(fontsize=6)
plt.yticks(fontsize=6)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-samples.pdf")
# The 20 shortest sessions.
df_session[["subject", "session_number", "Session Duration in Minutes"]].sort_values(
    "Session Duration in Minutes"
).head(20)

# Session duration per session number: one facet per subject (7 per row),
# bars coloured by the session's median task type.
# `g` stays bound to the FacetGrid. The previous one-liner chained
# `.fig.subplots_adjust(...)` onto the assignment, which stored that call's
# `None` return value in `g`.
g = sns.FacetGrid(df_session, col="subject", col_wrap=7, hue="task_type_median")
g.map(plt.bar, "session_number", "Session Duration in Minutes")
g.fig.subplots_adjust(wspace=0.15, hspace=0.15)

df_session.head()
# Pair plot of the per-session sensor means, coloured by scenario (sit / walk).
mean_cols = [
    f"{sensor}_{axis}_mean"
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
]
# A single `reset_index(drop=True)` replaces the former sequence
# `reset_index()` -> `reset_index(drop="True")` -> drop column "index":
# `drop` expects a bool (the string "True" only worked because it is truthy),
# and the temporary "index" column was never used.
df_temp = df_session[mean_cols + ["subject", "task_type_median"]].reset_index(drop=True)
# Task types 2, 4 and 6 are the walking tasks, all others count as sitting.
df_temp["scenario"] = np.where(
    df_temp["task_type_median"].isin([2, 4, 6]), "walk", "sit"
)
df_temp["subject"] = df_temp["subject"].astype(str) + " ."  # Workaround bug in seaborn
df_temp = df_temp.drop(columns=["task_type_median"])

# NOTE(review): pairplot is a figure-level function and creates its own
# figure, so this one presumably stays empty; kept to leave the set of open
# figures unchanged.
fig = plt.figure(dpi=180, figsize=(5.473, 5.473))
sns.pairplot(df_temp, hue="scenario", palette="tab10", plot_kws={"s": 15})
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-sit-walk.pdf")
# Pair plot of the per-session sensor means for three randomly drawn subjects.
mean_cols = [
    f"{sensor}_{axis}_mean"
    for sensor in ("acc", "gyr", "mag")
    for axis in ("x", "y", "z")
]
# df_session carries a plain default index, so resetting it with drop=True
# yields the same frame as adding and later removing an "index" column.
df_temp = df_session[mean_cols + ["subject"]].reset_index(drop=True)
df_temp["subject"] = df_temp["subject"].astype(str) + " ."  # Workaround bug in seaborn

# Reproducible draw of three subjects.
# NOTE(review): np.random.choice samples with replacement by default, so the
# draw could contain duplicates — confirm three distinct subjects are shown.
np.random.seed(SEED)
random_subjects = np.random.choice(df_temp["subject"].unique(), size=3)
is_drawn = df_temp["subject"].isin(list(random_subjects))
df_temp = df_temp[is_drawn]

fig = plt.figure(dpi=180, figsize=(5.473, 5.473))
g = sns.pairplot(df_temp, hue="subject", palette="tab10", plot_kws={"s": 30})
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-three-subjects.pdf")
Combined partial plot for use in thesis
# Prepare Data
# Per-session accelerometer-column means of three randomly drawn subjects,
# labelled with subject and scenario (sit / walk).
selected_cols = [
    "acc_x_mean",
    "acc_y_mean",
    "acc_z_mean",
    "task_type_median",
    "subject",
]
# df_session carries a plain default index, so resetting it with drop=True
# yields the same frame as adding and later removing an "index" column.
df_temp = df_session[selected_cols].reset_index(drop=True)

# Task types 2, 4 and 6 are the walking tasks, all others count as sitting.
is_walking = df_temp["task_type_median"].isin([2, 4, 6])
df_temp["Scenario"] = np.where(is_walking, "walk", "sit")
df_temp = df_temp.drop(columns=["task_type_median"])

# Display names (math text) used as axis labels in the plot.
display_names = {f"acc_{axis}_mean": f"$mean(Acc_{axis})$" for axis in ("x", "y", "z")}
display_names["subject"] = "Subject"
df_temp = df_temp.rename(columns=display_names)
df_temp["Subject"] = df_temp["Subject"].astype(str) + " ."  # Workaround bug in seaborn

# Reproducible draw of three subjects.
np.random.seed(SEED)
random_subjects = np.random.choice(df_temp["Subject"].unique(), size=3)
df_temp = df_temp[df_temp["Subject"].isin(list(random_subjects))]
# Plot separately
# Two pair plots of the same data: g1 coloured by subject, g2 by scenario.
plt.ioff()
g1 = sns.pairplot(df_temp, hue="Subject", palette=cmap.colors, plot_kws={"s": 15})
g1.fig.set_size_inches(6.5, 5)

g2 = sns.pairplot(df_temp, hue="Scenario", palette=cmap.colors[6:], plot_kws={"s": 15})
for ax in g2.axes.flat:
    ax.set_ylabel("")
g2.fig.set_size_inches(13, 5)
# Squeeze g2's axes into the band between 45.5 % and 80.5 % of its width.
g2.fig.subplots_adjust(left=0.455, right=0.805)

# Combine both plots
# The axes of both grids are registered on one new, wider figure.
# NOTE(review): `_axstack` and `_make_key` are private Matplotlib attributes
# and may not exist in newer releases — verify against the installed version.
f = plt.figure(figsize=(14.3, 5))
for g in (g1, g2):
    g.fig.subplots_adjust(top=0.99, bottom=0.15)
    for ax in g.fig.axes:
        f._axstack.add(f._make_key(ax), ax)

# Hand-made legends, built from marker-only proxy handles.
# NOTE(review): the subject ids and the walking/sitting order are hard-coded;
# confirm they match the random draw and the colour assignment of g1 / g2.
custom_lines = [
    Line2D([0], [0], color=cmap(idx), marker="o", lw=0) for idx in (0, 1, 2)
]
f.legend(
    custom_lines,
    ["588087", "698266", "893255"],
    title="Subjects",
    loc="upper right",
    handlelength=0.2,
    bbox_to_anchor=(0.312, 0.5, 0.5, 0.5),
)
custom_lines = [
    Line2D([0], [0], color=cmap(idx), marker="o", lw=0) for idx in (6, 7)
]
f.legend(
    custom_lines,
    ["Walking ", "Sitting"],
    title="Scenarios",
    loc="upper right",
    handlelength=0.2,
    bbox_to_anchor=(0.319, 0.2, 0.5, 0.5),
)
utils_save_plot(plt, REPORT_PATH / "buech2019-hmog-session-means.pdf")